### Robustness check: Descriptive statistics for different samples:

####### Cleaning working space
# NOTE(review): this wipes every object in the calling environment, so sourcing
# the script from a session that holds other work will destroy it. Consider
# running the script in a fresh R session instead -- confirm before removing.
rm(list = ls(all.names = TRUE)) # clears all objects, including hidden (dot-prefixed) ones.
gc() # frees up memory and reports the memory usage.

library(dplyr)

###########################################
####### Load the student-level data --------
data <- read.csv("data_11.csv", row.names = NULL)

# flag_interview: 1 when `id_interview` is missing (no interview id on record),
# 0 when an interview id is present.
data[["flag_interview"]] <- as.numeric(is.na(data$id_interview))
table(data$flag_interview)

# flag_grades: 1 when `missing_wo_edfis` is non-zero, 0 when it is zero
# (NA stays NA). The raw counter is tabulated first for reference.
table(data$missing_wo_edfis)
data[["flag_grades"]] <- as.numeric(data$missing_wo_edfis != 0)
table(data$flag_grades)

# Flag without network:
## Networks are not available for one classroom (school 1, class 10). EJA
## students (`young_adult_educ == 1`) and students older than 20 are excluded
## as well. flag_one is 1 for any of these rows.
data$flag_one <- as.numeric(data$school_id == 1 & data$class_id == 10)

# which() drops rows where the condition is NA, so those keep their value.
first_criteria <- which(data$young_adult_educ == 1 | data$age > 20)
data$flag_one[first_criteria] <- 1

table(data$flag_one)

# Flag on final situation in the administrative records:
# categories of `school_situation` to exclude
# (3, 4, 5: failed by absence, abandonment, transfer).
criteria_situation <- c(3, 4, 5)

# %in% never returns NA, so a missing `school_situation` is flagged 0.
data$flag_two <- as.numeric(data$school_situation %in% criteria_situation)
table(data$flag_two)
    
# Flag on quality of classroom (70% response and 10 observations)
# Fourth criterion: exclude classes with a certain percentage of missing
# interviews AFTER ACCOUNTING FOR THE OTHER CRITERIA.

# Combined individual-level exclusion flag: 1 when any of the grade, network/age
# or administrative-situation flags is 1.
data$flag <- as.numeric(
  data$flag_grades == 1 | data$flag_one == 1 | data$flag_two == 1
)
table(data$flag)

# valid_non_missing: 1 for students that have an interview id and are not
# flagged; which() leaves rows with an NA flag at 0.
data$valid_non_missing <- 0
data$valid_non_missing[which(!is.na(data$id_interview) & data$flag == 0)] <- 1
table(data$valid_non_missing)
table(data$valid_non_missing, is.na(data$id_interview), useNA = "always")
table(data$valid_non_missing, data$flag_grades, useNA = "always")

# One row per (school_id, class_id):
#   Total                   - number of students in the class
#   Valid                   - students not excluded by `flag`
#   Non-missing             - students with an interview id
#   Share Valid Non-Missing - Non-missing / Valid
# NOTE(review): the numerator counts every student with an interview id, not
# only the non-flagged ones (`valid_non_missing`), so the share can exceed 1.
# Confirm whether sum(valid_non_missing) was intended.
overall_classes <- data %>%
  group_by(school_id, class_id) %>%
  summarise(
    "Total" = n(),
    "Valid" = n() - sum(flag),
    "Non-missing" = sum(!is.na(id_interview)),
    "Share Valid Non-Missing" = sum(!is.na(id_interview)) / (n() - sum(flag))
  ) %>%
  ungroup() # do not leave the summary grouped by school_id

#View(overall_classes)

# Row indices of `classes` that fail a quality rule: fewer than `min_valid`
# valid students, or a response share below `min_share`. Rows for which the
# comparison is NA are not selected (which() drops them).
find_bad_classes <- function(classes, min_valid, min_share) {
  which(
    classes[["Valid"]] < min_valid |
      classes[["Share Valid Non-Missing"]] < min_share
  )
}

# Returns a 0/1 numeric vector with one entry per row of `students`:
# 1 when the student belongs to one of the classes in rows `bad_rows` of
# `classes`, or when the student's individual `flag` equals 1; 0 otherwise.
# Works for an empty `bad_rows` as well (only the individual flag applies).
flag_bad_classes <- function(students, classes, bad_rows) {
  flagged <- numeric(nrow(students))

  bad_keys <- paste(
    classes[["school_id"]][bad_rows],
    classes[["class_id"]][bad_rows],
    sep = "|"
  )
  student_keys <- paste(
    students[["school_id"]],
    students[["class_id"]],
    sep = "|"
  )

  # A row with a missing id never matches a class, exactly as `==` would
  # yield NA for it and which() would drop it.
  has_ids <- !is.na(students[["school_id"]]) & !is.na(students[["class_id"]])
  flagged[has_ids & student_keys %in% bad_keys] <- 1

  # Individually excluded students are flagged regardless of their class.
  flagged[which(students[["flag"]] == 1)] <- 1

  flagged
}

# Classes failing each (minimum share, minimum valid students) rule.
bad_class_70_10 <- find_bad_classes(overall_classes, min_valid = 10, min_share = .7)
bad_class_80_10 <- find_bad_classes(overall_classes, min_valid = 10, min_share = .8)
bad_class_50_10 <- find_bad_classes(overall_classes, min_valid = 10, min_share = .5)
bad_class_70_2 <- find_bad_classes(overall_classes, min_valid = 2, min_share = .7)
bad_class_80_2 <- find_bad_classes(overall_classes, min_valid = 2, min_share = .8)
length(bad_class_70_10)
length(bad_class_80_10)

# One exclusion flag per rule (columns are added in this order).
data$flag_class_70_10 <- flag_bad_classes(data, overall_classes, bad_class_70_10)
data$flag_class_80_10 <- flag_bad_classes(data, overall_classes, bad_class_80_10)
data$flag_class_50_10 <- flag_bad_classes(data, overall_classes, bad_class_50_10)
data$flag_class_70_2 <- flag_bad_classes(data, overall_classes, bad_class_70_2)
data$flag_class_80_2 <- flag_bad_classes(data, overall_classes, bad_class_80_2)

# Flag for final sample:
lista_alunos_deletar <-
  readRDS("intermediary_outputs/students_id_to_delete.RData")

# NOTE(review): `lista_alunos_deletar` is used directly as a subscript, which
# is only correct if it holds row positions of `data` (or a logical mask).
# If it holds student identifiers, this should match on the id column
# instead -- confirm against the script that writes the file.
data$flag_select_sample <- 0
data$flag_select_sample[lista_alunos_deletar] <- 1

# flag_final: 1 when the student is in the deletion list, has race >= 4, or
# has a missing race; 0 otherwise.
data$flag_final <- 0
data$flag_final[
  which(data$flag_select_sample == 1 | data$race >= 4 | is.na(data$race))
] <- 1

# Distribution of every sample flag.
table(data$flag_class_70_10)
table(data$flag_class_80_10)
table(data$flag_class_50_10)
table(data$flag_class_80_2)
table(data$flag_class_70_2)
table(data$flag_final)


# Persist the flagged data set.
saveRDS(data, "intermediary_outputs/robustness/data_step_descriptive.Rds")
  